# ---------------------------------------------------------------------------
# Imports and display configuration.
# NOTE: this file is a flattened Jupyter notebook; IPython magics and shell
# commands are kept as comments so the file is valid Python. Duplicate
# imports from the original merged cells have been removed.
# ---------------------------------------------------------------------------

# To supress warnings
import warnings

warnings.filterwarnings("ignore")

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libraries to help with data visualization
# %matplotlib inline  # IPython magic — only meaningful inside a notebook
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive visualization.
# Install plotly first if needed (shell command, not Python):
# pip install plotly
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.offline as pyo
from plotly.subplots import make_subplots

# For statistical analysis
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
# NOTE(review): `plot_confusion_matrix` was removed in scikit-learn 1.2
# (superseded by ConfusionMatrixDisplay); the original import of it breaks on
# modern sklearn and the name is never used below, so it is disabled here.
# from sklearn.metrics import plot_confusion_matrix

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To handle class imbalance
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Removes the limit from the number of displayed columns and rows.
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.max_rows", 200)
# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# Load the bank churners dataset into a dataframe
data = pd.read_csv('BankChurners.csv')
# Optionally drop the last two columns (presumably auxiliary pre-computed
# classifier columns — TODO confirm against the raw CSV); left disabled.
#data = data[data.columns[:-2]]
data.head(5) # Check first 5 rows
data.tail(5) # Check last 5 rows
# Customer age: box plot (with mean marker) on top, histogram below.
fig = make_subplots(rows=2, cols=1)
age_box = go.Box(x=data['Customer_Age'], name='Age Box Plot', boxmean=True)
age_hist = go.Histogram(x=data['Customer_Age'], name='Age Histogram')
fig.add_trace(age_box, row=1, col=1)
fig.add_trace(age_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Ages")
fig.show()
# 2x2 grid of donut charts: overall gender split (left column, spanning both
# rows), gender split among Platinum card holders (top right), and among Blue
# card holders (bottom right).
fig = make_subplots(
rows=2, cols=2,subplot_titles=('','<b>Platinum Card Holders','<b>Blue Card Holders<b>','Residuals'),
vertical_spacing=0.09,
specs=[[{"type": "pie","rowspan": 2} ,{"type": "pie"}] ,
[None ,{"type": "pie"}] ,
]
)
# Overall gender distribution.
# NOTE(review): labels are hard-coded in Female-first order while values come
# from value_counts() (sorted by frequency) — correct only if Female is the
# more frequent gender; verify against the data.
fig.add_trace(
go.Pie(values=data.Gender.value_counts().values,labels=['<b>Female<b>','<b>Male<b>'],hole=0.3,pull=[0,0.3]),
row=1, col=1
)
# Gender split among Platinum card holders
fig.add_trace(
go.Pie(
labels=['Female Platinum Card Holders','Male Platinum Card Holders'],
values=data.query('Card_Category=="Platinum"').Gender.value_counts().values,
pull=[0,0.05,0.5],
hole=0.3
),
row=1, col=2
)
# Gender split among Blue card holders
fig.add_trace(
go.Pie(
labels=['Female Blue Card Holders','Male Blue Card Holders'],
values=data.query('Card_Category=="Blue"').Gender.value_counts().values,
pull=[0,0.2,0.5],
hole=0.3
),
row=2, col=2
)
fig.update_layout(
height=800,
showlegend=True,
title_text="<b>Distribution Of Gender And Different Card Statuses<b>",
)
fig.show()
# Number of dependents: box plot (with mean marker) on top, histogram below.
fig = make_subplots(rows=2, cols=1)
dep_box = go.Box(x=data['Dependent_count'], name='Dependent count Box Plot', boxmean=True)
dep_hist = go.Histogram(x=data['Dependent_count'], name='Dependent count Histogram')
fig.add_trace(dep_box, row=1, col=1)
fig.add_trace(dep_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of Dependent counts (close family size)")
fig.show()
# Donut charts for the categorical demographic columns.
# FIX: "Propotion" typo in all four user-facing titles corrected to "Proportion".
ex.pie(data, names='Education_Level', title='Proportion Of Education Levels', hole=0.33)
ex.pie(data, names='Marital_Status', title='Proportion Of Different Marriage Statuses', hole=0.33)
ex.pie(data, names='Income_Category', title='Proportion Of Different Income Levels', hole=0.33)
ex.pie(data, names='Card_Category', title='Proportion Of Different Card Categories', hole=0.33)
# Tenure (months on book): box plot and histogram stacked vertically.
fig = make_subplots(rows=2, cols=1)
tenure_box = go.Box(x=data['Months_on_book'], name='Months on book Box Plot', boxmean=True)
tenure_hist = go.Histogram(x=data['Months_on_book'], name='Months on book Histogram')
fig.add_trace(tenure_box, row=1, col=1)
fig.add_trace(tenure_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of months the customer is part of the bank")
fig.show()
# Report how peaked/flat the tenure distribution is
print('Kurtosis of Months on book features is : {}'.format(data['Months_on_book'].kurt()))
# Number of bank products held: box plot and histogram stacked vertically.
fig = make_subplots(rows=2, cols=1)
prod_box = go.Box(x=data['Total_Relationship_Count'], name='Total no. of products Box Plot', boxmean=True)
prod_hist = go.Histogram(x=data['Total_Relationship_Count'], name='Total no. of products Histogram')
fig.add_trace(prod_box, row=1, col=1)
fig.add_trace(prod_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of Total no. of products held by the customer")
fig.show()
# Months inactive in the last year: box plot and histogram stacked vertically.
fig = make_subplots(rows=2, cols=1)
inactive_box = go.Box(x=data['Months_Inactive_12_mon'], name='number of months inactive Box Plot', boxmean=True)
inactive_hist = go.Histogram(x=data['Months_Inactive_12_mon'], name='number of months inactive Histogram')
fig.add_trace(inactive_box, row=1, col=1)
fig.add_trace(inactive_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of the number of months inactive in the last 12 months")
fig.show()
# Credit limit: box plot and histogram stacked vertically.
fig = make_subplots(rows=2, cols=1)
limit_box = go.Box(x=data['Credit_Limit'], name='Credit_Limit Box Plot', boxmean=True)
limit_hist = go.Histogram(x=data['Credit_Limit'], name='Credit_Limit Histogram')
fig.add_trace(limit_box, row=1, col=1)
fig.add_trace(limit_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of the Credit Limit")
fig.show()
# Total transaction amount (last 12 months): box plot and histogram.
fig = make_subplots(rows=2, cols=1)
amt_box = go.Box(x=data['Total_Trans_Amt'], name='Total_Trans_Amt Box Plot', boxmean=True)
amt_hist = go.Histogram(x=data['Total_Trans_Amt'], name='Total_Trans_Amt Histogram')
fig.add_trace(amt_box, row=1, col=1)
fig.add_trace(amt_hist, row=2, col=1)
fig.update_layout(height=700, width=1200, title_text="Distribution of the Total Transaction Amount (Last 12 months)")
fig.show()
# Target balance: share of churned vs retained customers
ex.pie(data, names='Attrition_Flag', title='Proportion of churn vs not churn customers', hole=0.33)
# Bare expression: displays the dataframe only when run in a notebook
data
# Pairwise relationships between all numeric columns, coloured by churn status
sns.pairplot(data, hue="Attrition_Flag")
plt.figure(figsize=(15,7))
# FIX: numeric_only=True — pandas >= 2.0 raises on non-numeric columns in
# corr(); older pandas silently dropped them, so the output is unchanged there.
sns.heatmap(data.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, fmt='.2f', cmap='Spectral')
plt.show()
# Mean credit limit per churn status, split by gender
sns.barplot(x=data['Attrition_Flag'], y=data['Credit_Limit'], hue=data['Gender'])
### Function to plot distributions and Boxplots of customers
def plot(x, target='Attrition_Flag'):
    """Plot distributions and boxplots of a numeric column split by churn.

    x: name of the numeric column in ``data`` to visualize
    target: binary target column (1 = attrited, 0 = existing)

    NOTE(review): assumes ``data[target]`` is already encoded as 0/1 — call
    this only after the Attrition_Flag replacement step.
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    axs[0, 0].set_title(f'Distribution of {x} of customer who attrited', fontsize=12, fontweight='bold')
    # FIX: histplot(kde=True, stat='density') replaces sns.distplot, which was
    # deprecated and then removed from seaborn.
    sns.histplot(data[(data[target] == 1)][x], ax=axs[0, 0], color='teal', kde=True, stat='density')
    axs[0, 1].set_title(f"Distribution of {x} of customer who didn't attrite", fontsize=12, fontweight='bold')
    sns.histplot(data[(data[target] == 0)][x], ax=axs[0, 1], color='orange', kde=True, stat='density')
    axs[1, 0].set_title(f'Boxplot of {x} w.r.t ProductTaken', fontsize=12, fontweight='bold')
    # Dashed separator between the distribution row and the boxplot row
    line = plt.Line2D((.1, .9), (.5, .5), color='grey', linewidth=1.5, linestyle='--')
    fig.add_artist(line)
    # FIX: keyword x=/y= — positional data arguments to sns.boxplot were
    # deprecated in seaborn 0.12 and removed in 0.13.
    sns.boxplot(x=data[target], y=data[x], ax=axs[1, 0], palette='gist_rainbow', showmeans=True)
    axs[1, 1].set_title(f'Boxplot of {x} w.r.t Personal Loan - Without outliers', fontsize=12, fontweight='bold')
    sns.boxplot(x=data[target], y=data[x], ax=axs[1, 1], showfliers=False, palette='gist_rainbow', showmeans=True)  # turning off outliers from boxplot
    plt.tight_layout(pad=4)
    plt.show()
# Mean of each engagement metric per churn status, split by gender
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Trans_Ct'], hue= data['Gender'])
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Trans_Amt'], hue= data['Gender'])
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Ct_Chng_Q4_Q1'], hue= data['Gender'])
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Amt_Chng_Q4_Q1'], hue= data['Gender'])
sns.barplot(x=data['Attrition_Flag'],y=data['Total_Revolving_Bal'], hue= data['Gender'])
# NOTE(review): plot() filters rows with data[target] == 1 / == 0, but at this
# point Attrition_Flag still holds the raw strings ('Attrited Customer' /
# 'Existing Customer') — it is only mapped to 1/0 in the next step — so both
# filters match nothing here. This call likely belongs after that mapping.
plot('Customer_Age')
# Encode the target: 1 = churned ("Attrited Customer"), 0 = retained
data.Attrition_Flag = data.Attrition_Flag.replace({'Attrited Customer':1,'Existing Customer':0}) # replacing values of attrition flag
# Encode gender: 1 = Female, 0 = Male
data.Gender = data.Gender.replace({'F':1,'M':0}) # replacing values of Gender
data.isnull().sum() # Finding sum of all null values across all variables
data.info() # to check for data types of various variables
data.nunique() # To check for unique values across various variables
# CLIENTNUM is a per-customer identifier with no predictive value
data = data.drop(['CLIENTNUM'], axis=1) # Let Drop Customer Id Column
data= data.dropna(axis=0, how='any') # Drop any other null values
data['Attrition_Flag'] = data['Attrition_Flag'].astype('category')
data['Gender'] = data['Gender'].astype('category')
data.info()
# One-hot encode the categoricals. Note Gender (already 0/1) is also dummied
# here, producing Gender_0/Gender_1 columns — redundant but harmless.
data = pd.get_dummies(data, columns=['Education_Level','Marital_Status','Income_Category','Card_Category','Gender']) #create dummy variables
data.info()
data.isnull().sum()
df = data.copy() # Let us copy data into dataframe
X = df.drop(["Attrition_Flag"], axis=1) #drop the dependent variable
y = df["Attrition_Flag"] # defining dependent variable
# Splitting data into training, validation and test sets:
# first we split data into 2 parts, say temporary and test
# (final proportions: 60% train / 20% validation / 20% test, stratified on y)
X_temp, X_test, y_temp, y_test = train_test_split(
X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of the remaining 80% = 20% of the full dataset)
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
# Candidate classifiers, all seeded for reproducibility
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # per-model arrays of CV scores
names = []  # model names, parallel to `results`
score = []  # per-model recall on the validation set

# Mean 5-fold stratified cross-validated recall for each candidate
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring="recall", cv=splitter
    )
    results.append(fold_scores)
    names.append(name)
    print("{}: {}".format(name, fold_scores.mean() * 100))

# Fit each candidate on the full training set and score recall on validation
print("\n" "Validation Performance:" "\n")
for name, model in models:
    model.fit(X_train, y_train)
    val_recall = recall_score(y_val, model.predict(X_val))
    score.append(val_recall)
    print("{}: {}".format(name, val_recall))

# Boxplot of the CV score spread for every model
fig = plt.figure()
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a single-row dataframe with Accuracy, Recall, Precision and F1.
    """
    # predictions on the supplied feature set
    predictions = model.predict(predictors)

    # fold the four headline metrics into a one-row dataframe
    return pd.DataFrame(
        {
            "Accuracy": accuracy_score(target, predictions),
            "Recall": recall_score(target, predictions),
            "Precision": precision_score(target, predictions),
            "F1": f1_score(target, predictions),
        },
        index=[0],
    )
def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    cm = confusion_matrix(target, model.predict(predictors))
    total = cm.flatten().sum()
    # annotate each cell with "count\npercent-of-total"
    cell_text = [
        "{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total)
        for count in cm.flatten()
    ]
    labels = np.asarray(cell_text).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline logistic regression on the (imbalanced) training data
lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)

# FIX: the lines below were markdown cells flattened into the script — bare
# prose is a SyntaxError in Python, so they are kept as comments.
# Let's evaluate the model performance by using KFold and cross_val_score.
# K-Folds cross-validation provides dataset indices to split data into
# train/validation sets. Split dataset into k consecutive stratified folds
# (without shuffling by default). Each fold is then used once as validation
# while the k - 1 remaining folds form the training set.

scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=lr, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
log_reg_model_train_perf = model_performance_classification_sklearn(
lr, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf  # bare expression: displays only in a notebook
# Calculating different metrics on validation set
log_reg_model_val_perf = model_performance_classification_sklearn(lr, X_val, y_val)
print("Validation performance:")
log_reg_model_val_perf
# creating confusion matrix
confusion_matrix_sklearn(lr, X_val, y_val)
# Class counts before oversampling (1 = churned, 0 = retained)
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# sampling_strategy=1 -> synthesize minority samples until classes are 1:1
sm = SMOTE(
sampling_strategy=1, k_neighbors=5, random_state=1
) # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
log_reg_over = LogisticRegression(random_state=1)
# Training the basic logistic regression model with training set
log_reg_over.fit(X_train_over, y_train_over)
# 5-fold stratified cross-validation of the oversampled model, scored on recall
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_over = cross_val_score(
estimator=log_reg_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
log_reg_over_train_perf = model_performance_classification_sklearn(
log_reg_over, X_train_over, y_train_over
)
print("Training performance:")
log_reg_over_train_perf
# Calculating different metrics on validation set
log_reg_over_val_perf = model_performance_classification_sklearn(
log_reg_over, X_val, y_val
)
print("validation performance:")
log_reg_over_val_perf
# creating confusion matrix
confusion_matrix_sklearn(log_reg_over, X_val, y_val)
# Random undersampling: shrink the majority class to match the minority
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
# Logistic regression trained on the undersampled data
log_reg_under = LogisticRegression(random_state=1)
log_reg_under.fit(X_train_un, y_train_un)
# 5-fold stratified cross-validation, scored on recall
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_under = cross_val_score(
estimator=log_reg_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
log_reg_under_train_perf = model_performance_classification_sklearn(
log_reg_under, X_train_un, y_train_un
)
print("Training performance:")
log_reg_under_train_perf
# Calculating different metrics on validation set
log_reg_under_val_perf = model_performance_classification_sklearn(
log_reg_under, X_val, y_val
)
print("Validation performance:")
log_reg_under_val_perf
# creating confusion matrix
confusion_matrix_sklearn(log_reg_under, X_val, y_val)
# Choose the type of classifier.
lr_estimator = LogisticRegression(random_state=1, solver="saga")
# Grid of parameters to choose from
# C is the inverse regularization strength, searched over 0.1 .. 1.0
parameters = {"C": np.arange(0.1, 1.1, 0.1)}
# Run the grid search
# (tuned on the SMOTE-oversampled training data, scored on recall)
grid_obj = GridSearchCV(lr_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
lr_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
lr_estimator.fit(X_train_over, y_train_over)
# Calculating different metrics on train set
log_reg_reg_train_perf = model_performance_classification_sklearn(
lr_estimator, X_train_over, y_train_over
)
print("Training performance:")
log_reg_reg_train_perf
# Calculating different metrics on validation set
log_reg_reg_val_perf = model_performance_classification_sklearn(
lr_estimator, X_val, y_val
)
print("Validation performance:")
log_reg_reg_val_perf
# creating confusion matrix
confusion_matrix_sklearn(lr_estimator, X_val, y_val)
# Creating pipeline
model = DecisionTreeClassifier(random_state=1)
# Parameter grid to pass in GridSearchCV
param_grid = {
"criterion": ["gini", "entropy"],
"max_depth": [3, 4, 5, None],
"min_samples_split": [2, 4, 7, 10, 15],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV (exhaustive search over the 2*4*5 = 40 candidates)
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
# Creating new pipeline with best parameters
# NOTE(review): hyperparameters hard-coded from an earlier search run —
# confirm they still match grid_cv.best_params_ before relying on them
dtree_tuned1 = DecisionTreeClassifier(
random_state=1, criterion="entropy", max_depth=None, min_samples_split=2
)
# Fit the model on training data
dtree_tuned1.fit(X_train, y_train)
# Calculating different metrics on train set
dtree_grid_train = model_performance_classification_sklearn(
dtree_tuned1, X_train, y_train
)
print("Training performance:")
dtree_grid_train
# Calculating different metrics on validation set
dtree_grid_val = model_performance_classification_sklearn(dtree_tuned1, X_val, y_val)
print("Validation performance:")
dtree_grid_val
# creating confusion matrix
confusion_matrix_sklearn(dtree_tuned1, X_val, y_val)
# Creating pipeline
model = DecisionTreeClassifier(random_state=1)
# Parameter grid to pass in RandomizedSearchCV
param_grid = {
"criterion": ["gini", "entropy"],
"max_depth": [3, 4, 5, None],
"min_samples_split": [2, 4, 7, 10, 15],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV
# (n_iter=20 random combinations out of the 2*4*5 = 40-candidate grid)
randomized_cv = RandomizedSearchCV(
estimator=model,
param_distributions=param_grid,
n_iter=20,
scoring=scorer,
cv=5,
random_state=1,
)
# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)
print(
"Best parameters are {} with CV score={}:".format(
randomized_cv.best_params_, randomized_cv.best_score_
)
)
# Creating new pipeline with best parameters
# NOTE(review): hyperparameters hard-coded from an earlier search run —
# confirm they still match randomized_cv.best_params_ before relying on them
dtree_tuned2 = DecisionTreeClassifier(
    random_state=1, criterion="entropy", max_depth=None, min_samples_split=2
)
# Fit the model on training data
dtree_tuned2.fit(X_train, y_train)
# Calculating different metrics on train set
dtree_random_train = model_performance_classification_sklearn(
    dtree_tuned2, X_train, y_train
)
print("Training performance:")
dtree_random_train
# Calculating different metrics on validation set
dtree_random_val = model_performance_classification_sklearn(dtree_tuned2, X_val, y_val)
print("Validation performance:")
dtree_random_val
# creating confusion matrix
# BUG FIX: the original plotted dtree_tuned1 here, but this section evaluates
# the randomized-search model dtree_tuned2.
confusion_matrix_sklearn(dtree_tuned2, X_val, y_val)
# %%time  # FIX: IPython cell magic — a SyntaxError in plain Python, kept as a comment
# defining model
model = XGBClassifier(random_state=1, eval_metric='logloss')
# Parameter grid to pass in GridSearchCV
param_grid = {
    'n_estimators': np.arange(50, 150, 50),
    'scale_pos_weight': [2, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.05],
    'gamma': [0, 1, 3, 5],
    'subsample': [0.8, 0.9, 1],
    'max_depth': np.arange(1, 5, 1),
    'reg_lambda': [5, 10],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV (n_jobs=-1: use all cores; verbose=2: log progress)
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:".format(grid_cv.best_params_, grid_cv.best_score_))
# building model with best parameters
# NOTE(review): hyperparameters hard-coded from an earlier grid-search run —
# confirm they still match grid_cv.best_params_ before relying on them
xgb_tuned1 = XGBClassifier(
random_state=1,
n_estimators=50,
scale_pos_weight=10,
subsample=0.9,
learning_rate=0.01,
gamma=0,
eval_metric="logloss",
reg_lambda=5,
max_depth=2,
)
# Fit the model on training data
xgb_tuned1.fit(X_train, y_train)
# Calculating different metrics on train set
xgboost_grid_train = model_performance_classification_sklearn(
xgb_tuned1, X_train, y_train
)
print("Training performance:")
xgboost_grid_train
# Calculating different metrics on validation set
xgboost_grid_val = model_performance_classification_sklearn(xgb_tuned1, X_val, y_val)
print("Validation performance:")
xgboost_grid_val
# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned1, X_val, y_val)
# %%time  # FIX: IPython cell magic — a SyntaxError in plain Python, kept as a comment
# defining model
model = XGBClassifier(random_state=1, eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
param_grid = {
    'n_estimators': np.arange(50, 150, 50),
    'scale_pos_weight': [2, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.05],
    'gamma': [0, 1, 3, 5],
    'subsample': [0.8, 0.9, 1],
    'max_depth': np.arange(1, 5, 1),
    'reg_lambda': [5, 10],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV
# FIX: renamed from `xgb_tuned2` — the original reused that name for both the
# fitted search object and the final model built just below, silently
# discarding the search results.
xgb_random_cv = RandomizedSearchCV(
    estimator=model, param_distributions=param_grid, n_iter=50,
    scoring=scorer, cv=5, random_state=1, n_jobs=-1,
)
# Fitting parameters in RandomizedSearchCV
xgb_random_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:".format(xgb_random_cv.best_params_, xgb_random_cv.best_score_))
# building model with best parameters
# NOTE(review): hyperparameters hard-coded from an earlier randomized-search
# run — confirm they still match the search's best_params_ before relying on them
xgb_tuned2 = XGBClassifier(
random_state=1,
n_estimators=50,
scale_pos_weight=10,
gamma=0,
subsample=0.8,
learning_rate=0.1,
eval_metric="logloss",
max_depth=2,
reg_lambda=10,
)
# Fit the model on training data
xgb_tuned2.fit(X_train, y_train)
# Calculating different metrics on train set
xgboost_random_train = model_performance_classification_sklearn(
xgb_tuned2, X_train, y_train
)
print("Training performance:")
xgboost_random_train
# Calculating different metrics on validation set
xgboost_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgboost_random_val
# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
# training performance comparison
# Each frame is transposed so metrics become rows and models become columns.
models_train_comp_df = pd.concat(
[
log_reg_model_train_perf.T,
log_reg_over_train_perf.T,
log_reg_reg_train_perf.T,
log_reg_under_train_perf.T,
dtree_grid_train.T,
dtree_random_train.T,
xgboost_grid_train.T,
xgboost_random_train.T,
],
axis=1,
)
# Column labels must stay in the same order as the frames concatenated above
models_train_comp_df.columns = [
"Logistic Regression",
"Logistic Regression with oversampled data",
"Regularised Logistic Regression",
"Logistic Regression with undersampled data",
"Decision Tree Tuned with Grid search",
"Decision Tree Tuned with Random search",
"Xgboost Tuned with Grid search",
"Xgboost Tuned with Random Search",
]
print("Training performance comparison:")
models_train_comp_df
# Validation performance comparison
# Each frame is transposed so metrics become rows and models become columns.
models_val_comp_df = pd.concat(
[
log_reg_model_val_perf.T,
log_reg_over_val_perf.T,
log_reg_reg_val_perf.T,
log_reg_under_val_perf.T,
dtree_grid_val.T,
dtree_random_val.T,
xgboost_grid_val.T,
xgboost_random_val.T,
],
axis=1,
)
# Column labels must stay in the same order as the frames concatenated above
models_val_comp_df.columns = [
"Logistic Regression",
"Logistic Regression with oversampled data",
"Regularised Logistic Regression",
"Logistic Regression with undersampled data",
"Decision Tree Tuned with Grid search",
"Decision Tree Tuned with Random search",
"Xgboost Tuned with Grid search",
"Xgboost Tuned with Random Search",
]
print("Validation performance comparison:")
models_val_comp_df
# Horizontal bar chart of feature importances from the grid-tuned XGBoost
# model, sorted so the most important feature appears at the top.
feature_names = X_train.columns
importances = xgb_tuned1.feature_importances_
indices = np.argsort(importances)
y_pos = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(y_pos, importances[indices], color="violet", align="center")
plt.yticks(y_pos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
data.info()
# creating a list of numerical variables
# BUG FIX: the original list also contained "Education_Level" (one-hot encoded
# away earlier, so no such column exists any more) and "Mariatal" (a typo for
# a column that never existed) — ColumnTransformer raises on missing columns.
numerical_features = [
    "Total_Trans_Ct",
    "Total_Trans_Amt",
    "Total_Ct_Chng_Q4_Q1",
    "Total_Revolving_Bal",
    "Total_Amt_Chng_Q4_Q1",
]
# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# combining the numerical transformer using a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are
# not listed in "numerical_features" to pass through the column transformer
# without any changes
# Separating target variable and other variables
X = data.drop("Attrition_Flag", axis=1)
Y = data["Attrition_Flag"]
# Splitting the data into train and test sets (70/30, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
# Creating new pipeline with best parameters:
# median imputation on the numeric columns, then the tuned XGBoost model
model = Pipeline(
steps=[
("pre", preprocessor),
(
"XGB",
XGBClassifier(
random_state=1,
n_estimators=50,
scale_pos_weight=10,
subsample=0.8,
learning_rate=0.1,
gamma=0,
eval_metric="logloss",
reg_lambda=10,
max_depth=2,
),
),
]
)
# Fit the model on training data
model.fit(X_train, y_train)